R packages to be used.

if (!require('dplyr')) install.packages('dplyr'); library('dplyr')
if (!require('tidyr')) install.packages('tidyr'); library('tidyr')
if (!require('lubridate')) install.packages('lubridate'); library('lubridate')
if (!require('tidyquant')) install.packages('tidyquant'); library('tidyquant')
if (!require('plotly')) install.packages('plotly'); library('plotly')
# Training
if (!require('caret')) install.packages('caret'); library('caret')
# Time Series (ARIMA)
if (!require('forecast')) install.packages('forecast'); library('forecast')
# Moving Average
if (!require('zoo')) install.packages('zoo'); library('zoo')

Data Import

df_country <- read.csv("./cases_malaysia.csv", header=TRUE)
# Convert date column & sort
df_country$date <- as.Date(df_country$date, format="%Y-%m-%d")
df_country <- df_country[order(df_country$date),]
# Filter data
df_data <- df_country[,c("date", "cases_new", "cases_active")]

Linear train

split_ratio <- 0.7
set.seed(168)
split_index <- createDataPartition(df_data$cases_new, p=split_ratio, list=FALSE)
data_train <- df_data[split_index,]
data_test <- df_data[-split_index,]

linear_model <- lm(cases_new~cases_active,data=data_train)
summary(linear_model)
## 
## Call:
## lm(formula = cases_new ~ cases_active, data = data_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3742.5  -215.4  -127.7   221.8  4264.9 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.286e+02  5.175e+01   2.485   0.0133 *  
## cases_active 8.244e-02  6.365e-04 129.506   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 948.8 on 494 degrees of freedom
## Multiple R-squared:  0.9714, Adjusted R-squared:  0.9713 
## F-statistic: 1.677e+04 on 1 and 494 DF,  p-value: < 2.2e-16
# linear_model <- train(cases_new~date, data=data_train, method="lm", trControl=data_train_control)

plot(linear_model)

linear_prediction <- linear_model %>% predict(data_test)
linear_compare <- data.frame(actual=data_test$cases_new, predicted=linear_prediction)
head(linear_compare)
##    actual predicted
## 1       4  128.9024
## 5       3  129.1497
## 8       0  129.2322
## 10      0  129.2322
## 14      1  129.7268
## 16      1  129.7268
data.frame(
  R2 = R2(linear_prediction, data_test$cases_new),
  RMSE = RMSE(linear_prediction, data_test$cases_new),
  MAE = MAE(linear_prediction, data_test$cases_new)
)
##          R2     RMSE      MAE
## 1 0.9721543 906.3865 545.7654
# Chart init
df_predicted <- data.frame(date=data_test$date, cases_new=linear_prediction)
df_actual <- data_test
df_train <- data_train

lm_chart <- plot_ly()
# Predicted Data
lm_chart <- lm_chart %>% 
  add_trace(
    x = df_predicted[["date"]], y = df_predicted[["cases_new"]],
    name = "Predicted Data",
    type = 'scatter',
    mode = 'lines',
    line = list(color = 'red', width = 3)
  )
# Test Data
lm_chart <- lm_chart %>% 
  add_trace(
    x = df_actual[["date"]], y = df_actual[["cases_new"]],
    name = "Actual Data",
    type = 'scatter',
    mode = 'lines',
    line = list(color = 'skyblue', width = 3)
  )

lm_chart <- lm_chart %>% 
  add_trace(
    x = df_train[["date"]], y = df_train[["cases_new"]], 
    name = "Train Data",
    type = "scatter",
    mode = "lines",
    line = list(color = 'green', width = 2)
  )

# Set figure title, x and y-axes titles
lm_chart <- lm_chart %>% layout(
  title = "Model Comparison of Daily New Cases",
  xaxis = list(title="Recorded Time"),
  yaxis = list(title="Daily Count of New Cases")
)%>%
  layout(plot_bgcolor='#e5ecf6',
          xaxis = list(
            zerolinecolor = '#ffff',
            zerolinewidth = 2,
            gridcolor = 'ffff'),
          yaxis = list(
            zerolinecolor = '#ffff',
            zerolinewidth = 2,
            gridcolor = 'ffff')
          )


lm_chart

Predict future new cases Using Predicted future active cases from ARIMA Model

init_year <- format(as.Date(df_data[1,1], format="%Y-%m-%d"),"%Y")
init_day <- yday(as.Date(df_data[1,1], format="%Y-%m-%d"))
data_arima <- ts(df_data$cases_active, start=c(init_year,init_day), frequency=365)
head(data_arima)
## Time Series:
## Start = c(2020, 25) 
## End = c(2020, 30) 
## Frequency = 365 
## [1] 4 4 4 4 7 8
arima_model <- auto.arima(df_data$cases_active, trace = TRUE, ic = 'aicc', approximation = FALSE, stepwise = FALSE)
## 
##  ARIMA(0,1,0)                    : 12574.82
##  ARIMA(0,1,0) with drift         : 12576.06
##  ARIMA(0,1,1)                    : 12201.06
##  ARIMA(0,1,1) with drift         : 12202.55
##  ARIMA(0,1,2)                    : 12000.65
##  ARIMA(0,1,2) with drift         : 12002.3
##  ARIMA(0,1,3)                    : 11949.82
##  ARIMA(0,1,3) with drift         : 11951.54
##  ARIMA(0,1,4)                    : 11892.69
##  ARIMA(0,1,4) with drift         : 11894.47
##  ARIMA(0,1,5)                    : 11833.57
##  ARIMA(0,1,5) with drift         : 11835.4
##  ARIMA(1,1,0)                    : 11789.02
##  ARIMA(1,1,0) with drift         : 11790.96
##  ARIMA(1,1,1)                    : 11637.72
##  ARIMA(1,1,1) with drift         : 11639.74
##  ARIMA(1,1,2)                    : 11636.15
##  ARIMA(1,1,2) with drift         : 11638.18
##  ARIMA(1,1,3)                    : 11637.46
##  ARIMA(1,1,3) with drift         : 11639.49
##  ARIMA(1,1,4)                    : 11612.64
##  ARIMA(1,1,4) with drift         : 11614.67
##  ARIMA(2,1,0)                    : 11710.42
##  ARIMA(2,1,0) with drift         : 11712.4
##  ARIMA(2,1,1)                    : 11636.58
##  ARIMA(2,1,1) with drift         : 11638.61
##  ARIMA(2,1,2)                    : 11638.07
##  ARIMA(2,1,2) with drift         : 11640.11
##  ARIMA(2,1,3)                    : Inf
##  ARIMA(2,1,3) with drift         : 11640.24
##  ARIMA(3,1,0)                    : 11684.41
##  ARIMA(3,1,0) with drift         : 11686.42
##  ARIMA(3,1,1)                    : 11636.13
##  ARIMA(3,1,1) with drift         : 11638.16
##  ARIMA(3,1,2)                    : Inf
##  ARIMA(3,1,2) with drift         : Inf
##  ARIMA(4,1,0)                    : 11635.54
##  ARIMA(4,1,0) with drift         : 11637.56
##  ARIMA(4,1,1)                    : 11623.02
##  ARIMA(4,1,1) with drift         : 11625.06
##  ARIMA(5,1,0)                    : 11626.8
##  ARIMA(5,1,0) with drift         : 11628.84
## 
## 
## 
##  Best model: ARIMA(1,1,4)
arima_model
## Series: df_data$cases_active 
## ARIMA(1,1,4) 
## 
## Coefficients:
##          ar1      ma1      ma2      ma3     ma4
##       0.9737  -0.6159  -0.1013  -0.0863  0.2176
## s.e.  0.0090   0.0383   0.0443   0.0453  0.0415
## 
## sigma^2 estimated as 785818:  log likelihood=-5800.26
## AIC=11612.52   AICc=11612.64   BIC=11639.89
forecast_length <- 30
arima_predict <- forecast(arima_model, forecast_length)
head(arima_predict)
## $method
## [1] "ARIMA(1,1,4)"
## 
## $model
## Series: df_data$cases_active 
## ARIMA(1,1,4) 
## 
## Coefficients:
##          ar1      ma1      ma2      ma3     ma4
##       0.9737  -0.6159  -0.1013  -0.0863  0.2176
## s.e.  0.0090   0.0383   0.0443   0.0453  0.0415
## 
## sigma^2 estimated as 785818:  log likelihood=-5800.26
## AIC=11612.52   AICc=11612.64   BIC=11639.89
## 
## $level
## [1] 80 95
## 
## $mean
## Time Series:
## Start = 709 
## End = 738 
## Frequency = 1 
##  [1] 40614.16 40307.63 40048.57 39882.01 39719.83 39561.92 39408.15 39258.42
##  [9] 39112.62 38970.65 38832.42 38697.81 38566.74 38439.11 38314.84 38193.83
## [17] 38076.00 37961.26 37849.54 37740.75 37634.82 37531.67 37431.23 37333.44
## [25] 37238.21 37145.48 37055.18 36967.26 36881.65 36798.29
## 
## $lower
## Time Series:
## Start = 709 
## End = 738 
## Frequency = 1 
##           80%         95%
## 709 39478.114  38876.7256
## 710 38391.848  37377.6936
## 711 37403.746  36003.6634
## 712 36566.876  34811.9487
## 713 35617.312  33445.5671
## 714 34581.553  31945.1080
## 715 33477.514  30338.0259
## 716 32317.611  28643.3705
## 717 31110.863  26874.9883
## 718 29864.124  25043.4190
## 719 28582.818  23157.0088
## 720 27271.371  21222.5802
## 721 25933.490  19245.8519
## 722 24572.339  17231.7126
## 723 23190.662  15184.4066
## 724 21790.866  13107.6638
## 725 20375.089  11004.7946
## 726 18945.235   8878.7600
## 727 17503.021   6732.2261
## 728 16049.997   4567.6059
## 729 14587.570   2387.0935
## 730 13117.025    192.6909
## 731 11639.533  -2013.7690
## 732 10156.171  -4230.6039
## 733  8667.926  -6456.2660
## 734  7175.708  -8689.3286
## 735  5680.356 -10928.4742
## 736  4182.644 -13172.4849
## 737  2683.288 -15420.2328
## 738  1182.949 -17670.6729
## 
## $upper
## Time Series:
## Start = 709 
## End = 738 
## Frequency = 1 
##          80%      95%
## 709 41750.21 42351.60
## 710 42223.42 43237.57
## 711 42693.38 44093.47
## 712 43197.15 44952.08
## 713 43822.36 45994.10
## 714 44542.28 47178.73
## 715 45338.78 48478.27
## 716 46199.22 49873.46
## 717 47114.38 51350.25
## 718 48077.18 52897.89
## 719 49082.01 54507.82
## 720 50124.25 56173.04
## 721 51199.99 57887.63
## 722 52305.88 59646.51
## 723 53439.01 61445.27
## 724 54596.79 63279.99
## 725 55776.90 65147.20
## 726 56977.28 67043.76
## 727 58196.05 68966.85
## 728 59431.50 70913.89
## 729 60682.07 72882.55
## 730 61946.32 74870.65
## 731 63222.94 76876.24
## 732 64510.70 78897.48
## 733 65808.48 80932.68
## 734 67115.25 82980.28
## 735 68430.01 85038.84
## 736 69751.88 87107.01
## 737 71080.02 89183.54
## 738 72413.63 91267.25
plot(arima_predict, main = "Predicted Active Cases", col.main = "black")

last_date <- as.Date(df_data[(nrow(df_data)):nrow(df_data),1], format="%Y-%m-%d")
last_date <- last_date + 1
df_arima <- data.frame(
  date=seq(last_date, by = "day", length.out = forecast_length),
  cases_active=arima_predict$mean
)

predict(linear_model, newdata=df_arima)
##        1        2        3        4        5        6        7        8 
## 3476.616 3451.347 3429.991 3416.261 3402.892 3389.874 3377.197 3364.854 
##        9       10       11       12       13       14       15       16 
## 3352.836 3341.133 3329.737 3318.641 3307.836 3297.315 3287.070 3277.095 
##       17       18       19       20       21       22       23       24 
## 3267.381 3257.923 3248.713 3239.745 3231.013 3222.510 3214.230 3206.168 
##       25       26       27       28       29       30 
## 3198.318 3190.673 3183.230 3175.982 3168.925 3162.053
combined_prediction <- linear_model %>% predict(df_arima)
df_combined_predicted <- data.frame(date=df_arima$date, cases_new=combined_prediction)
df_data$month <- strftime(df_data$date, "%m")
df_data$year <- strftime(df_data$date, "%Y")

df_smooth <- df_data %>%                         
  group_by(date=lubridate::floor_date(df_data$date, "month")) %>% 
  dplyr::summarize(cases_new = mean(cases_new)) %>% 
  data.frame
combined_chart <- plot_ly()
# Predicted Data
combined_chart <- combined_chart %>% 
  add_trace(
    x = df_combined_predicted[["date"]], y = df_combined_predicted[["cases_new"]],
    name = "Future Predicted Data",
    type = 'scatter',
    mode = 'lines',
    line = list(color = 'red', width = 3)
  )
# Test Data
combined_chart <- combined_chart %>% 
  add_trace(
    x = df_smooth[["date"]], y = df_smooth[["cases_new"]],
    name = "Actual Data (Rolled to Monthly)",
    type = 'scatter',
    mode = 'lines',
    line = list(color = 'skyblue', width = 3)
  )
# Set figure title, x and y-axes titles
combined_chart <- combined_chart %>% layout(
  title = "Prediction of Daily New Cases",
  xaxis = list(title="Recorded Time"),
  yaxis = list(title="Daily Count of New Cases")
)%>%
  layout(plot_bgcolor='#e5ecf6',
          xaxis = list(
            zerolinecolor = '#ffff',
            zerolinewidth = 2,
            gridcolor = 'ffff'),
          yaxis = list(
            zerolinecolor = '#ffff',
            zerolinewidth = 2,
            gridcolor = 'ffff')
          )

combined_chart

End of document